import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from plotly.offline import iplot
import plotly as py
import plotly.tools as tls
import cufflinks as cf
import plotly.express as px
import pandas_profiling as pf
# Initialise plotly's notebook (offline) rendering for this session.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead
# of embedding it in the notebook -- confirm against the plotly docs.
py.offline.init_notebook_mode(connected=True)
# Put cufflinks into offline mode; the `.iplot()` calls below rely on cufflinks.
cf.go_offline()
# Read the Black Friday training set and preview its first 20 records.
black_friday_df = pd.read_csv(filepath_or_buffer="blackFriday_train.csv")
black_friday_df.head(20)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
| 5 | 1000003 | P00193542 | M | 26-35 | 15 | A | 3 | 0 | 1 | 2.0 | NaN | 15227 |
| 6 | 1000004 | P00184942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 8.0 | 17.0 | 19215 |
| 7 | 1000004 | P00346142 | M | 46-50 | 7 | B | 2 | 1 | 1 | 15.0 | NaN | 15854 |
| 8 | 1000004 | P0097242 | M | 46-50 | 7 | B | 2 | 1 | 1 | 16.0 | NaN | 15686 |
| 9 | 1000005 | P00274942 | M | 26-35 | 20 | A | 1 | 1 | 8 | NaN | NaN | 7871 |
| 10 | 1000005 | P00251242 | M | 26-35 | 20 | A | 1 | 1 | 5 | 11.0 | NaN | 5254 |
| 11 | 1000005 | P00014542 | M | 26-35 | 20 | A | 1 | 1 | 8 | NaN | NaN | 3957 |
| 12 | 1000005 | P00031342 | M | 26-35 | 20 | A | 1 | 1 | 8 | NaN | NaN | 6073 |
| 13 | 1000005 | P00145042 | M | 26-35 | 20 | A | 1 | 1 | 1 | 2.0 | 5.0 | 15665 |
| 14 | 1000006 | P00231342 | F | 51-55 | 9 | A | 1 | 0 | 5 | 8.0 | 14.0 | 5378 |
| 15 | 1000006 | P00190242 | F | 51-55 | 9 | A | 1 | 0 | 4 | 5.0 | NaN | 2079 |
| 16 | 1000006 | P0096642 | F | 51-55 | 9 | A | 1 | 0 | 2 | 3.0 | 4.0 | 13055 |
| 17 | 1000006 | P00058442 | F | 51-55 | 9 | A | 1 | 0 | 5 | 14.0 | NaN | 8851 |
| 18 | 1000007 | P00036842 | M | 36-45 | 1 | B | 1 | 1 | 1 | 14.0 | 16.0 | 11788 |
| 19 | 1000008 | P00249542 | M | 26-35 | 12 | C | 4+ | 1 | 1 | 5.0 | 15.0 | 19614 |
# Build an exploratory pandas-profiling report for the dataset and save it
# to disk as a standalone HTML file.
profile = pf.ProfileReport(black_friday_df, explorative=True)
profile.to_file("Profiling_report.html")
/opt/miniconda3/envs/intel/lib/python3.7/site-packages/scipy/stats/stats.py:4812: RuntimeWarning: overflow encountered in long_scalars
# Bare expression: in a notebook, the cell's last expression is rendered,
# so this shows the profiling report inline.
profile
They are males.
It is 26-35
# Share of rows per occupation code.
# Passing the raw 'Occupation' column as `values` makes plotly SUM the
# occupation codes inside each sector (code * number of rows), which distorts
# every slice and hides occupation 0 entirely. Count the rows per occupation
# first and plot those counts instead.
occupation_counts = (
    black_friday_df['Occupation']
    .value_counts()
    .rename_axis('Occupation')
    .reset_index(name='Count')
)
fig = px.pie(occupation_counts, values='Count', names='Occupation')
fig.show()
# Impute missing secondary/tertiary product categories with each column's
# most frequent value (the first mode when several values tie).
for category_column in ('Product_Category_2', 'Product_Category_3'):
    most_frequent_value = black_friday_df[category_column].mode()[0]
    black_friday_df[category_column] = black_friday_df[category_column].fillna(most_frequent_value)
product_category_columns = ['Product_Category_1', 'Product_Category_2', 'Product_Category_3']

# Line plot of the first 50 rows of each product-category column.
for category_column in product_category_columns:
    black_friday_df[[category_column]].head(50).iplot()

# Histogram of each product-category column over the full dataset.
for category_column in product_category_columns:
    black_friday_df[[category_column]].iplot(kind='hist')

# Summary statistics for the three product-category columns.
black_friday_df[product_category_columns].describe()
| Product_Category_1 | Product_Category_2 | Product_Category_3 | |
|---|---|---|---|
| count | 550068.000000 | 550068.000000 | 550068.000000 |
| mean | 5.404270 | 9.260768 | 14.989567 |
| std | 3.936211 | 4.294093 | 2.739846 |
| min | 1.000000 | 2.000000 | 3.000000 |
| 25% | 1.000000 | 8.000000 | 16.000000 |
| 50% | 5.000000 | 8.000000 | 16.000000 |
| 75% | 8.000000 | 14.000000 | 16.000000 |
| max | 20.000000 | 18.000000 | 18.000000 |
On average, product 2 was bought the most.
# Box plots of the three product-category columns, drawn side by side.
box_plot_columns = ['Product_Category_1', 'Product_Category_2', 'Product_Category_3']
black_friday_df.loc[:, box_plot_columns].iplot(kind='box')
import numpy as np

# For every row, record the 0-based position of the largest value among the
# three product-category columns (0 -> Product_Category_1, 1 -> _2, 2 -> _3);
# ties resolve to the first column, as with the previous per-row argmax.
#
# The columns are selected by name rather than with the positional slice
# `-4:-1`, which silently points at different columns once another column
# (e.g. 'Max_Purchased_prd_type') is appended and the cell is re-run.
# A single vectorised argmax replaces the per-row `iloc` loop, which is very
# slow on a frame of this size.
#
# Assumes the category columns contain no NaN at this point (they are
# mode-imputed earlier in the notebook).
# NOTE(review): these columns hold category codes, so "largest" compares
# codes rather than quantities -- confirm this is the intended metric.
category_columns_for_argmax = ['Product_Category_1', 'Product_Category_2', 'Product_Category_3']
li = np.argmax(black_friday_df[category_columns_for_argmax].to_numpy(), axis=1).tolist()
# Preview the first 20 rows again.
black_friday_df.head(20)
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | 8.0 | 16.0 | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.0 | 14.0 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | 8.0 | 16.0 | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.0 | 16.0 | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | 8.0 | 16.0 | 7969 |
| 5 | 1000003 | P00193542 | M | 26-35 | 15 | A | 3 | 0 | 1 | 2.0 | 16.0 | 15227 |
| 6 | 1000004 | P00184942 | M | 46-50 | 7 | B | 2 | 1 | 1 | 8.0 | 17.0 | 19215 |
| 7 | 1000004 | P00346142 | M | 46-50 | 7 | B | 2 | 1 | 1 | 15.0 | 16.0 | 15854 |
| 8 | 1000004 | P0097242 | M | 46-50 | 7 | B | 2 | 1 | 1 | 16.0 | 16.0 | 15686 |
| 9 | 1000005 | P00274942 | M | 26-35 | 20 | A | 1 | 1 | 8 | 8.0 | 16.0 | 7871 |
| 10 | 1000005 | P00251242 | M | 26-35 | 20 | A | 1 | 1 | 5 | 11.0 | 16.0 | 5254 |
| 11 | 1000005 | P00014542 | M | 26-35 | 20 | A | 1 | 1 | 8 | 8.0 | 16.0 | 3957 |
| 12 | 1000005 | P00031342 | M | 26-35 | 20 | A | 1 | 1 | 8 | 8.0 | 16.0 | 6073 |
| 13 | 1000005 | P00145042 | M | 26-35 | 20 | A | 1 | 1 | 1 | 2.0 | 5.0 | 15665 |
| 14 | 1000006 | P00231342 | F | 51-55 | 9 | A | 1 | 0 | 5 | 8.0 | 14.0 | 5378 |
| 15 | 1000006 | P00190242 | F | 51-55 | 9 | A | 1 | 0 | 4 | 5.0 | 16.0 | 2079 |
| 16 | 1000006 | P0096642 | F | 51-55 | 9 | A | 1 | 0 | 2 | 3.0 | 4.0 | 13055 |
| 17 | 1000006 | P00058442 | F | 51-55 | 9 | A | 1 | 0 | 5 | 14.0 | 16.0 | 8851 |
| 18 | 1000007 | P00036842 | M | 36-45 | 1 | B | 1 | 1 | 1 | 14.0 | 16.0 | 11788 |
| 19 | 1000008 | P00249542 | M | 26-35 | 12 | C | 4+ | 1 | 1 | 5.0 | 15.0 | 19614 |
# Store the per-row arg-max positions held in `li` as a new column.
black_friday_df['Max_Purchased_prd_type'] = li

# Narrow the frame to gender plus the derived product-type column.
black_friday_df_gender_purchase = black_friday_df.loc[:, ['Gender', 'Max_Purchased_prd_type']]

# Bar chart of product-type frequencies for rows where Gender is 'F'.
is_female = black_friday_df_gender_purchase['Gender'] == 'F'
black_friday_df_gender_purchase_F = black_friday_df_gender_purchase[is_female]
female_type_counts = black_friday_df_gender_purchase_F['Max_Purchased_prd_type'].value_counts()
female_type_counts.iplot(kind='bar')

# Same chart for rows where Gender is 'M'.
is_male = black_friday_df_gender_purchase['Gender'] == 'M'
black_friday_df_gender_purchase_M = black_friday_df_gender_purchase[is_male]
male_type_counts = black_friday_df_gender_purchase_M['Max_Purchased_prd_type'].value_counts()
male_type_counts.iplot(kind='bar')
# Most frequent 'Max_Purchased_prd_type' within each age bracket, as a bar chart.
age_and_product_type = black_friday_df[['Age', 'Max_Purchased_prd_type']]
modal_type_by_age = age_and_product_type.groupby('Age').agg(pd.Series.mode)
modal_type_by_age.iplot(kind='bar')
Because all the plots look nearly identical, it is difficult to identify any differences between them. Hence, no conclusion can be drawn from this analysis.
# Distribution of purchase amount per city category, split in turn by
# marital status, gender and age bracket. Each call stays a separate
# expression so that a notebook cell can render the figure it returns.
px.violin(black_friday_df, x='City_Category', y='Purchase', color='Marital_Status')
px.violin(black_friday_df, x='City_Category', y='Purchase', color='Gender')
px.violin(black_friday_df, x='City_Category', y='Purchase', color='Age')
Purchases are almost equal across the age ranges. We can conclude that men account for a higher percentage of purchases than women.
# Box plot of purchase amount per age bracket, coloured by gender.
px.box(black_friday_df, x='Age', y='Purchase', color='Gender')
# Distribution plot of the purchase amounts via cufflinks.
# The distplot code path reads `self.columns`, which a Series does not have
# (hence the AttributeError), so select 'Purchase' as a one-column DataFrame.
# The `mean=` argument is dropped: it was being given a float.
# NOTE(review): cufflinks documents `mean` as a bool / list of column names
# and it does not appear to be used for distplots -- confirm if a mean
# marker is actually wanted here.
black_friday_df[['Purchase']].iplot(kind='distplot')
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-145-d7a12d1a7ce8> in <module> ----> 1 black_friday_df['Purchase'].iplot(kind='distplot',mean=black_friday_df['Purchase'].mean()) /opt/miniconda3/envs/intel/lib/python3.7/site-packages/cufflinks/plotlytools.py in _iplot(self, kind, data, layout, filename, sharing, title, xTitle, yTitle, zTitle, theme, colors, colorscale, fill, width, dash, mode, interpolation, symbol, size, barmode, sortbars, bargap, bargroupgap, bins, histnorm, histfunc, orientation, boxpoints, annotations, keys, bestfit, bestfit_colors, mean, mean_colors, categories, x, y, z, text, gridcolor, zerolinecolor, margin, labels, values, secondary_y, secondary_y_title, subplots, shape, error_x, error_y, error_type, locations, lon, lat, asFrame, asDates, asFigure, asImage, dimensions, asPlot, asUrl, online, **kwargs) 1104 hist_data=self.transpose().values 1105 kw=check_kwargs(kwargs,FF_DISTPLOT) -> 1106 group_labels=kw.pop('group_labels',self.columns) 1107 if histnorm: 1108 kw['histnorm']=histnorm /opt/miniconda3/envs/intel/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name) 5485 ): 5486 return self[name] -> 5487 return object.__getattribute__(self, name) 5488 5489 def __setattr__(self, name: str, value) -> None: AttributeError: 'Series' object has no attribute 'columns'
import seaborn as sns

# Use a wide default figure size for the seaborn/matplotlib plots below.
sns.set(rc={'figure.figsize': (11.7, 8.27)})

# Histogram with a KDE overlay of the purchase amounts, on a density scale.
# `distplot` is deprecated and scheduled for removal; `histplot` with
# kde=True / stat='density' is the axes-level replacement seaborn recommends,
# and kde_kws={'cut': 3} keeps the KDE extending past the data range the way
# `distplot` drew it.
sns.histplot(x=black_friday_df['Purchase'], kde=True, stat='density', kde_kws={'cut': 3})
/opt/miniconda3/envs/intel/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
<AxesSubplot:ylabel='Density'>
# Arithmetic mean of the purchase amounts.
black_friday_df.loc[:, 'Purchase'].mean()
9263.968712959126
# Median of the purchase amounts.
black_friday_df.loc[:, 'Purchase'].median()
8047.0